# -*- coding: utf-8 -*-
"""SPAP_2021_LSQ_TableS2_BERT.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1n8Y8Iq00GOBGGSfaTSPfiWbY8RXUzHik

**1. Install packages and mount drive**
"""

# Install the notebook's dependencies into the current runtime using IPython
# shell magics (these lines only run inside a notebook/IPython session).
# transformers and simpletransformers are pinned to exact versions; torch,
# torchvision, seqeval and tensorboardx are installed unpinned.
!pip install torch torchvision
!pip install transformers==2.10.0
!pip install seqeval
!pip install tensorboardx
!pip install simpletransformers==0.9.1

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline

import pandas as pd
import numpy as np
from statistics import mean

import gc
import requests
import os

from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
from scipy.special import softmax

import random

import torch
# Report whether a CUDA device is visible to PyTorch, and which PyTorch
# version the runtime provides.
print("Cuda available" if torch.cuda.is_available() else "CPU")
print("PyTorch version: ", torch.__version__)

# Mount Google Drive so the training data under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

"""**2. Read training data**"""

# Location of the labelled training file on the mounted drive.
data_path = '/content/drive/MyDrive/spap_state/spap_state_attention/data/'
df = pd.read_csv(data_path + 'spap_state_attention_supplementary_train.csv')

# Model input text: the tweet's own text, followed by the quoted tweet's text
# (space-separated) whenever a quoted text is present.
df['text'] = np.where(
    df['quoted_status.full_text'].notnull(),
    df['full_text'] + " " + df['quoted_status.full_text'],
    df['full_text'],
)

# Target column: the final label cast to float.
df['covid'] = df['final_label'].astype(float)

# Keep only the two columns used for modelling.
df = df[['text', 'covid']]

# Sanity checks: number of rows and the relative frequency of each label.
print(len(df['covid']))
print(df['covid'].value_counts(normalize=True))

"""**3. Set seed for deterministic modeling**"""

def set_seed(seed):
    """Seed every random number generator used by this script.

    Seeds PyTorch (CPU and all CUDA devices), NumPy and Python's ``random``
    module with ``seed``, switches cuDNN to deterministic mode with
    auto-tuning (benchmark) disabled, and exports the seed as the
    ``PYTHONHASHSEED`` environment variable.

    NOTE: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    export only influences processes launched after this call.

    Args:
        seed: Integer seed shared by all generators.
    """
    # PyTorch: CPU generator first, then every CUDA device.
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)

    # cuDNN: force deterministic algorithms and disable the benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NumPy's global generator, then Python's built-in one.
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

"""**4. Performance metrics**"""

def report_results(A, B):
    """Score predicted labels ``A`` against reference labels ``B``.

    The two inputs are paired in a temporary DataFrame (pandas Series are
    aligned on their index) and every pair with a missing value on either
    side is dropped before scoring. The metrics are computed by scikit-learn
    with its default settings, passing ``B`` as ``y_true`` and ``A`` as
    ``y_pred``.

    Args:
        A: Predicted labels (Series, list or array).
        B: Reference (true) labels, same length/index as ``A``.

    Returns:
        list: ``[precision, recall, f1, accuracy]``.
    """
    # Pair the labels and discard incomplete pairs. A dedicated local name is
    # used so the module-level training dataframe is not shadowed.
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    y_pred = paired['A']
    y_true = paired['B']

    return [
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        accuracy_score(y_true, y_pred),
    ]

"""**5. 5-fold cross validation for BERT model**"""

# Hyper-parameters handed to simpletransformers' ClassificationModel.
args = {
    # Where model outputs and cached features are written.
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',

    # Mixed-precision settings (fp16 is switched off).
    'fp16': False,
    'fp16_opt_level': 'O1',

    # Input size, batching and optimisation.
    'max_seq_length': 128,
    'train_batch_size': 8,
    'eval_batch_size': 8,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 2,
    'weight_decay': 0,
    'learning_rate': 4e-5,
    'adam_epsilon': 1e-8,
    'warmup_ratio': 0.06,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    # Logging, evaluation and checkpointing.
    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,
    'use_tensorboard': True,

    # Each cross-validation fold reuses the same directories, so allow
    # overwriting and rebuild the input features every time.
    'overwrite_output_dir': True,
    'reprocess_input_data': True,
}

# Seed every random number generator once, before any fold is trained.
set_seed(777)

# 5-fold cross-validation with a seeded shuffle, so the fold assignment is the
# same on every run.
kf = KFold(n_splits=5, random_state=777, shuffle=True)

# One [precision, recall, f1, accuracy] row per fold.
cv_results = []

for train_index, val_index in kf.split(df):
    # Split the dataframe. The validation fold is copied explicitly because a
    # prediction column is added to it below; assigning a column to the bare
    # result of .iloc raises pandas' SettingWithCopyWarning.
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index].copy()

    # Start every fold from the pretrained 'bert-base-uncased' weights.
    model = ClassificationModel('bert', 'bert-base-uncased', args=args)

    # Train on the training folds.
    model.train_model(train_df)

    # Evaluate on the held-out fold; the predicted class of each row is the
    # index of the largest model output.
    result, model_outputs, wrong_predictions = model.eval_model(val_df)
    val_df['BERT_covid'] = np.argmax(model_outputs, axis=1)

    # Score predictions (first argument) against the true labels (second).
    performance = report_results(val_df['BERT_covid'], val_df['covid'])
    print(performance)
    cv_results.append(performance)

# Report the mean and standard deviation of each metric across the folds.
df_cv_results = pd.DataFrame(cv_results, columns=['precision', 'recall', 'f-1', 'accuracy'])
print(df_cv_results.mean().round(4))
print(df_cv_results.std().round(4))
